{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 5.0,
  "eval_steps": 500,
  "global_step": 790,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.03164556962025317,
      "grad_norm": 5.788066387176514,
      "learning_rate": 0.0002,
      "loss": 2.6104,
      "step": 5
    },
    {
      "epoch": 0.06329113924050633,
      "grad_norm": 1.5542352199554443,
      "learning_rate": 0.00019872773536895675,
      "loss": 0.7864,
      "step": 10
    },
    {
      "epoch": 0.0949367088607595,
      "grad_norm": 1.996996283531189,
      "learning_rate": 0.00019745547073791352,
      "loss": 0.5532,
      "step": 15
    },
    {
      "epoch": 0.12658227848101267,
      "grad_norm": 0.9193770885467529,
      "learning_rate": 0.00019618320610687023,
      "loss": 0.522,
      "step": 20
    },
    {
      "epoch": 0.15822784810126583,
      "grad_norm": 1.024322271347046,
      "learning_rate": 0.00019491094147582698,
      "loss": 0.5621,
      "step": 25
    },
    {
      "epoch": 0.189873417721519,
      "grad_norm": 0.873715341091156,
      "learning_rate": 0.00019363867684478372,
      "loss": 0.4996,
      "step": 30
    },
    {
      "epoch": 0.22151898734177214,
      "grad_norm": 0.8645951151847839,
      "learning_rate": 0.00019236641221374049,
      "loss": 0.502,
      "step": 35
    },
    {
      "epoch": 0.25316455696202533,
      "grad_norm": 0.7674330472946167,
      "learning_rate": 0.00019109414758269723,
      "loss": 0.433,
      "step": 40
    },
    {
      "epoch": 0.2848101265822785,
      "grad_norm": 0.7591924667358398,
      "learning_rate": 0.00018982188295165394,
      "loss": 0.4406,
      "step": 45
    },
    {
      "epoch": 0.31645569620253167,
      "grad_norm": 0.7595440745353699,
      "learning_rate": 0.00018854961832061068,
      "loss": 0.499,
      "step": 50
    },
    {
      "epoch": 0.34810126582278483,
      "grad_norm": 0.7064336538314819,
      "learning_rate": 0.00018727735368956745,
      "loss": 0.4756,
      "step": 55
    },
    {
      "epoch": 0.379746835443038,
      "grad_norm": 0.7232657670974731,
      "learning_rate": 0.0001860050890585242,
      "loss": 0.5227,
      "step": 60
    },
    {
      "epoch": 0.41139240506329117,
      "grad_norm": 0.7500166296958923,
      "learning_rate": 0.00018473282442748093,
      "loss": 0.4481,
      "step": 65
    },
    {
      "epoch": 0.4430379746835443,
      "grad_norm": 0.6161800026893616,
      "learning_rate": 0.00018346055979643765,
      "loss": 0.4868,
      "step": 70
    },
    {
      "epoch": 0.47468354430379744,
      "grad_norm": 0.7168012857437134,
      "learning_rate": 0.00018218829516539442,
      "loss": 0.4833,
      "step": 75
    },
    {
      "epoch": 0.5063291139240507,
      "grad_norm": 0.7813606262207031,
      "learning_rate": 0.00018091603053435116,
      "loss": 0.4777,
      "step": 80
    },
    {
      "epoch": 0.5379746835443038,
      "grad_norm": 0.7056337594985962,
      "learning_rate": 0.0001796437659033079,
      "loss": 0.4521,
      "step": 85
    },
    {
      "epoch": 0.569620253164557,
      "grad_norm": 0.570087730884552,
      "learning_rate": 0.00017837150127226464,
      "loss": 0.4456,
      "step": 90
    },
    {
      "epoch": 0.6012658227848101,
      "grad_norm": 0.642938494682312,
      "learning_rate": 0.00017709923664122138,
      "loss": 0.4169,
      "step": 95
    },
    {
      "epoch": 0.6329113924050633,
      "grad_norm": 0.7025493383407593,
      "learning_rate": 0.00017582697201017812,
      "loss": 0.4987,
      "step": 100
    },
    {
      "epoch": 0.6645569620253164,
      "grad_norm": 0.7466819882392883,
      "learning_rate": 0.00017455470737913486,
      "loss": 0.4644,
      "step": 105
    },
    {
      "epoch": 0.6962025316455697,
      "grad_norm": 0.7106885313987732,
      "learning_rate": 0.00017328244274809163,
      "loss": 0.4653,
      "step": 110
    },
    {
      "epoch": 0.7278481012658228,
      "grad_norm": 0.6158185601234436,
      "learning_rate": 0.00017201017811704835,
      "loss": 0.4556,
      "step": 115
    },
    {
      "epoch": 0.759493670886076,
      "grad_norm": 0.678554117679596,
      "learning_rate": 0.0001707379134860051,
      "loss": 0.4587,
      "step": 120
    },
    {
      "epoch": 0.7911392405063291,
      "grad_norm": 0.8016729354858398,
      "learning_rate": 0.00016946564885496183,
      "loss": 0.4228,
      "step": 125
    },
    {
      "epoch": 0.8227848101265823,
      "grad_norm": 0.7110231518745422,
      "learning_rate": 0.0001681933842239186,
      "loss": 0.4303,
      "step": 130
    },
    {
      "epoch": 0.8544303797468354,
      "grad_norm": 0.6997452974319458,
      "learning_rate": 0.00016692111959287534,
      "loss": 0.4342,
      "step": 135
    },
    {
      "epoch": 0.8860759493670886,
      "grad_norm": 0.6250122785568237,
      "learning_rate": 0.00016564885496183205,
      "loss": 0.427,
      "step": 140
    },
    {
      "epoch": 0.9177215189873418,
      "grad_norm": 0.6947687864303589,
      "learning_rate": 0.0001643765903307888,
      "loss": 0.4763,
      "step": 145
    },
    {
      "epoch": 0.9493670886075949,
      "grad_norm": 0.680385947227478,
      "learning_rate": 0.00016310432569974556,
      "loss": 0.4554,
      "step": 150
    },
    {
      "epoch": 0.9810126582278481,
      "grad_norm": 0.5412645936012268,
      "learning_rate": 0.0001618320610687023,
      "loss": 0.4664,
      "step": 155
    },
    {
      "epoch": 1.0126582278481013,
      "grad_norm": 0.5828943848609924,
      "learning_rate": 0.00016055979643765905,
      "loss": 0.4573,
      "step": 160
    },
    {
      "epoch": 1.0443037974683544,
      "grad_norm": 0.6317119002342224,
      "learning_rate": 0.0001592875318066158,
      "loss": 0.325,
      "step": 165
    },
    {
      "epoch": 1.0759493670886076,
      "grad_norm": 0.6031287312507629,
      "learning_rate": 0.00015801526717557253,
      "loss": 0.3658,
      "step": 170
    },
    {
      "epoch": 1.1075949367088607,
      "grad_norm": 0.6438406109809875,
      "learning_rate": 0.00015674300254452927,
      "loss": 0.3645,
      "step": 175
    },
    {
      "epoch": 1.139240506329114,
      "grad_norm": 0.6503311395645142,
      "learning_rate": 0.000155470737913486,
      "loss": 0.323,
      "step": 180
    },
    {
      "epoch": 1.1708860759493671,
      "grad_norm": 0.8476307392120361,
      "learning_rate": 0.00015419847328244275,
      "loss": 0.347,
      "step": 185
    },
    {
      "epoch": 1.2025316455696202,
      "grad_norm": 0.7285150289535522,
      "learning_rate": 0.0001529262086513995,
      "loss": 0.3831,
      "step": 190
    },
    {
      "epoch": 1.2341772151898733,
      "grad_norm": 0.6327723860740662,
      "learning_rate": 0.00015165394402035624,
      "loss": 0.3577,
      "step": 195
    },
    {
      "epoch": 1.2658227848101267,
      "grad_norm": 0.6771088242530823,
      "learning_rate": 0.00015038167938931298,
      "loss": 0.3154,
      "step": 200
    },
    {
      "epoch": 1.2974683544303798,
      "grad_norm": 0.7355062365531921,
      "learning_rate": 0.00014910941475826972,
      "loss": 0.3706,
      "step": 205
    },
    {
      "epoch": 1.3291139240506329,
      "grad_norm": 0.7912581562995911,
      "learning_rate": 0.0001478371501272265,
      "loss": 0.3456,
      "step": 210
    },
    {
      "epoch": 1.360759493670886,
      "grad_norm": 0.6501379609107971,
      "learning_rate": 0.0001465648854961832,
      "loss": 0.3243,
      "step": 215
    },
    {
      "epoch": 1.3924050632911391,
      "grad_norm": 0.6570438146591187,
      "learning_rate": 0.00014529262086513994,
      "loss": 0.3595,
      "step": 220
    },
    {
      "epoch": 1.4240506329113924,
      "grad_norm": 0.6073997020721436,
      "learning_rate": 0.0001440203562340967,
      "loss": 0.3397,
      "step": 225
    },
    {
      "epoch": 1.4556962025316456,
      "grad_norm": 0.7310261130332947,
      "learning_rate": 0.00014274809160305345,
      "loss": 0.3641,
      "step": 230
    },
    {
      "epoch": 1.4873417721518987,
      "grad_norm": 0.8089779019355774,
      "learning_rate": 0.0001414758269720102,
      "loss": 0.3374,
      "step": 235
    },
    {
      "epoch": 1.518987341772152,
      "grad_norm": 0.8505273461341858,
      "learning_rate": 0.0001402035623409669,
      "loss": 0.3695,
      "step": 240
    },
    {
      "epoch": 1.5506329113924051,
      "grad_norm": 0.6972491145133972,
      "learning_rate": 0.00013893129770992368,
      "loss": 0.3556,
      "step": 245
    },
    {
      "epoch": 1.5822784810126582,
      "grad_norm": 0.740247368812561,
      "learning_rate": 0.00013765903307888042,
      "loss": 0.3604,
      "step": 250
    },
    {
      "epoch": 1.6139240506329116,
      "grad_norm": 0.818209707736969,
      "learning_rate": 0.00013638676844783716,
      "loss": 0.3538,
      "step": 255
    },
    {
      "epoch": 1.6455696202531644,
      "grad_norm": 0.822881817817688,
      "learning_rate": 0.0001351145038167939,
      "loss": 0.3494,
      "step": 260
    },
    {
      "epoch": 1.6772151898734178,
      "grad_norm": 0.7193669080734253,
      "learning_rate": 0.00013384223918575064,
      "loss": 0.3676,
      "step": 265
    },
    {
      "epoch": 1.7088607594936709,
      "grad_norm": 0.6926146149635315,
      "learning_rate": 0.00013256997455470738,
      "loss": 0.313,
      "step": 270
    },
    {
      "epoch": 1.740506329113924,
      "grad_norm": 1.0953829288482666,
      "learning_rate": 0.00013129770992366413,
      "loss": 0.3202,
      "step": 275
    },
    {
      "epoch": 1.7721518987341773,
      "grad_norm": 0.8663277626037598,
      "learning_rate": 0.00013002544529262087,
      "loss": 0.3488,
      "step": 280
    },
    {
      "epoch": 1.8037974683544302,
      "grad_norm": 1.1026146411895752,
      "learning_rate": 0.0001287531806615776,
      "loss": 0.3654,
      "step": 285
    },
    {
      "epoch": 1.8354430379746836,
      "grad_norm": 0.7661195993423462,
      "learning_rate": 0.00012748091603053435,
      "loss": 0.3693,
      "step": 290
    },
    {
      "epoch": 1.8670886075949367,
      "grad_norm": 0.6808319687843323,
      "learning_rate": 0.0001262086513994911,
      "loss": 0.3449,
      "step": 295
    },
    {
      "epoch": 1.8987341772151898,
      "grad_norm": 0.7904935479164124,
      "learning_rate": 0.00012493638676844783,
      "loss": 0.3219,
      "step": 300
    },
    {
      "epoch": 1.9303797468354431,
      "grad_norm": 0.7428227066993713,
      "learning_rate": 0.0001236641221374046,
      "loss": 0.3452,
      "step": 305
    },
    {
      "epoch": 1.9620253164556962,
      "grad_norm": 0.8595893383026123,
      "learning_rate": 0.00012239185750636134,
      "loss": 0.3629,
      "step": 310
    },
    {
      "epoch": 1.9936708860759493,
      "grad_norm": 0.7588908672332764,
      "learning_rate": 0.00012111959287531807,
      "loss": 0.333,
      "step": 315
    },
    {
      "epoch": 2.0253164556962027,
      "grad_norm": 0.6112965941429138,
      "learning_rate": 0.00011984732824427483,
      "loss": 0.2809,
      "step": 320
    },
    {
      "epoch": 2.0569620253164556,
      "grad_norm": 1.506441593170166,
      "learning_rate": 0.00011857506361323157,
      "loss": 0.2251,
      "step": 325
    },
    {
      "epoch": 2.088607594936709,
      "grad_norm": 0.8897147178649902,
      "learning_rate": 0.0001173027989821883,
      "loss": 0.2258,
      "step": 330
    },
    {
      "epoch": 2.1202531645569622,
      "grad_norm": 0.6773934960365295,
      "learning_rate": 0.00011603053435114504,
      "loss": 0.2115,
      "step": 335
    },
    {
      "epoch": 2.151898734177215,
      "grad_norm": 0.832305908203125,
      "learning_rate": 0.00011475826972010179,
      "loss": 0.2317,
      "step": 340
    },
    {
      "epoch": 2.1835443037974684,
      "grad_norm": 0.9453684687614441,
      "learning_rate": 0.00011348600508905853,
      "loss": 0.2325,
      "step": 345
    },
    {
      "epoch": 2.2151898734177213,
      "grad_norm": 0.9567768573760986,
      "learning_rate": 0.00011221374045801527,
      "loss": 0.2273,
      "step": 350
    },
    {
      "epoch": 2.2468354430379747,
      "grad_norm": 0.8501098155975342,
      "learning_rate": 0.000110941475826972,
      "loss": 0.2184,
      "step": 355
    },
    {
      "epoch": 2.278481012658228,
      "grad_norm": 1.046438217163086,
      "learning_rate": 0.00010966921119592877,
      "loss": 0.2181,
      "step": 360
    },
    {
      "epoch": 2.310126582278481,
      "grad_norm": 0.8483916521072388,
      "learning_rate": 0.0001083969465648855,
      "loss": 0.2176,
      "step": 365
    },
    {
      "epoch": 2.3417721518987342,
      "grad_norm": 0.805766224861145,
      "learning_rate": 0.00010712468193384224,
      "loss": 0.2303,
      "step": 370
    },
    {
      "epoch": 2.3734177215189876,
      "grad_norm": 0.8078694343566895,
      "learning_rate": 0.00010585241730279898,
      "loss": 0.2199,
      "step": 375
    },
    {
      "epoch": 2.4050632911392404,
      "grad_norm": 1.255946397781372,
      "learning_rate": 0.00010458015267175574,
      "loss": 0.234,
      "step": 380
    },
    {
      "epoch": 2.4367088607594938,
      "grad_norm": 0.8427215814590454,
      "learning_rate": 0.00010330788804071248,
      "loss": 0.2299,
      "step": 385
    },
    {
      "epoch": 2.4683544303797467,
      "grad_norm": 0.6670682430267334,
      "learning_rate": 0.00010203562340966922,
      "loss": 0.2237,
      "step": 390
    },
    {
      "epoch": 2.5,
      "grad_norm": 0.7526563405990601,
      "learning_rate": 0.00010076335877862595,
      "loss": 0.2276,
      "step": 395
    },
    {
      "epoch": 2.5316455696202533,
      "grad_norm": 0.8490801453590393,
      "learning_rate": 9.94910941475827e-05,
      "loss": 0.223,
      "step": 400
    },
    {
      "epoch": 2.5632911392405062,
      "grad_norm": 0.8474723100662231,
      "learning_rate": 9.821882951653944e-05,
      "loss": 0.23,
      "step": 405
    },
    {
      "epoch": 2.5949367088607596,
      "grad_norm": 0.9560302495956421,
      "learning_rate": 9.694656488549618e-05,
      "loss": 0.2254,
      "step": 410
    },
    {
      "epoch": 2.6265822784810124,
      "grad_norm": 0.8522864580154419,
      "learning_rate": 9.567430025445293e-05,
      "loss": 0.2304,
      "step": 415
    },
    {
      "epoch": 2.6582278481012658,
      "grad_norm": 0.7552340030670166,
      "learning_rate": 9.440203562340968e-05,
      "loss": 0.2223,
      "step": 420
    },
    {
      "epoch": 2.689873417721519,
      "grad_norm": 0.9277474284172058,
      "learning_rate": 9.312977099236642e-05,
      "loss": 0.232,
      "step": 425
    },
    {
      "epoch": 2.721518987341772,
      "grad_norm": 0.9359229803085327,
      "learning_rate": 9.185750636132316e-05,
      "loss": 0.2278,
      "step": 430
    },
    {
      "epoch": 2.7531645569620253,
      "grad_norm": 0.8999188542366028,
      "learning_rate": 9.05852417302799e-05,
      "loss": 0.2101,
      "step": 435
    },
    {
      "epoch": 2.7848101265822782,
      "grad_norm": 1.0271995067596436,
      "learning_rate": 8.931297709923665e-05,
      "loss": 0.2355,
      "step": 440
    },
    {
      "epoch": 2.8164556962025316,
      "grad_norm": 0.8051729798316956,
      "learning_rate": 8.804071246819339e-05,
      "loss": 0.2321,
      "step": 445
    },
    {
      "epoch": 2.848101265822785,
      "grad_norm": 0.9024167656898499,
      "learning_rate": 8.676844783715013e-05,
      "loss": 0.2401,
      "step": 450
    },
    {
      "epoch": 2.879746835443038,
      "grad_norm": 0.9658450484275818,
      "learning_rate": 8.549618320610687e-05,
      "loss": 0.2454,
      "step": 455
    },
    {
      "epoch": 2.911392405063291,
      "grad_norm": 0.766612708568573,
      "learning_rate": 8.422391857506363e-05,
      "loss": 0.2391,
      "step": 460
    },
    {
      "epoch": 2.9430379746835444,
      "grad_norm": 0.7812886238098145,
      "learning_rate": 8.295165394402035e-05,
      "loss": 0.2286,
      "step": 465
    },
    {
      "epoch": 2.9746835443037973,
      "grad_norm": 0.8621058464050293,
      "learning_rate": 8.167938931297711e-05,
      "loss": 0.2444,
      "step": 470
    },
    {
      "epoch": 3.0063291139240507,
      "grad_norm": 0.6189867854118347,
      "learning_rate": 8.040712468193385e-05,
      "loss": 0.2114,
      "step": 475
    },
    {
      "epoch": 3.037974683544304,
      "grad_norm": 0.8174425959587097,
      "learning_rate": 7.913486005089059e-05,
      "loss": 0.1325,
      "step": 480
    },
    {
      "epoch": 3.069620253164557,
      "grad_norm": 1.3215669393539429,
      "learning_rate": 7.786259541984733e-05,
      "loss": 0.1431,
      "step": 485
    },
    {
      "epoch": 3.1012658227848102,
      "grad_norm": 0.944607138633728,
      "learning_rate": 7.659033078880407e-05,
      "loss": 0.1334,
      "step": 490
    },
    {
      "epoch": 3.132911392405063,
      "grad_norm": 0.8664195537567139,
      "learning_rate": 7.531806615776081e-05,
      "loss": 0.1358,
      "step": 495
    },
    {
      "epoch": 3.1645569620253164,
      "grad_norm": 0.7892764806747437,
      "learning_rate": 7.404580152671756e-05,
      "loss": 0.124,
      "step": 500
    },
    {
      "epoch": 3.1962025316455698,
      "grad_norm": 0.9069737195968628,
      "learning_rate": 7.27735368956743e-05,
      "loss": 0.1388,
      "step": 505
    },
    {
      "epoch": 3.2278481012658227,
      "grad_norm": 0.9595538973808289,
      "learning_rate": 7.150127226463105e-05,
      "loss": 0.1305,
      "step": 510
    },
    {
      "epoch": 3.259493670886076,
      "grad_norm": 0.9580609798431396,
      "learning_rate": 7.022900763358778e-05,
      "loss": 0.139,
      "step": 515
    },
    {
      "epoch": 3.291139240506329,
      "grad_norm": 0.8919333219528198,
      "learning_rate": 6.895674300254454e-05,
      "loss": 0.1292,
      "step": 520
    },
    {
      "epoch": 3.3227848101265822,
      "grad_norm": 0.9798533320426941,
      "learning_rate": 6.768447837150128e-05,
      "loss": 0.1348,
      "step": 525
    },
    {
      "epoch": 3.3544303797468356,
      "grad_norm": 0.737899661064148,
      "learning_rate": 6.641221374045802e-05,
      "loss": 0.1418,
      "step": 530
    },
    {
      "epoch": 3.3860759493670884,
      "grad_norm": 0.8077306151390076,
      "learning_rate": 6.513994910941476e-05,
      "loss": 0.1434,
      "step": 535
    },
    {
      "epoch": 3.4177215189873418,
      "grad_norm": 0.6728256940841675,
      "learning_rate": 6.38676844783715e-05,
      "loss": 0.1347,
      "step": 540
    },
    {
      "epoch": 3.449367088607595,
      "grad_norm": 0.8441898822784424,
      "learning_rate": 6.259541984732826e-05,
      "loss": 0.1294,
      "step": 545
    },
    {
      "epoch": 3.481012658227848,
      "grad_norm": 0.7539904713630676,
      "learning_rate": 6.132315521628498e-05,
      "loss": 0.1337,
      "step": 550
    },
    {
      "epoch": 3.5126582278481013,
      "grad_norm": 0.874884843826294,
      "learning_rate": 6.005089058524174e-05,
      "loss": 0.1318,
      "step": 555
    },
    {
      "epoch": 3.5443037974683547,
      "grad_norm": 0.8220652937889099,
      "learning_rate": 5.877862595419847e-05,
      "loss": 0.1372,
      "step": 560
    },
    {
      "epoch": 3.5759493670886076,
      "grad_norm": 0.8709121942520142,
      "learning_rate": 5.750636132315522e-05,
      "loss": 0.1329,
      "step": 565
    },
    {
      "epoch": 3.607594936708861,
      "grad_norm": 1.0847886800765991,
      "learning_rate": 5.6234096692111956e-05,
      "loss": 0.1366,
      "step": 570
    },
    {
      "epoch": 3.6392405063291138,
      "grad_norm": 1.150924563407898,
      "learning_rate": 5.496183206106871e-05,
      "loss": 0.1401,
      "step": 575
    },
    {
      "epoch": 3.670886075949367,
      "grad_norm": 1.2749351263046265,
      "learning_rate": 5.3689567430025446e-05,
      "loss": 0.1388,
      "step": 580
    },
    {
      "epoch": 3.7025316455696204,
      "grad_norm": 0.8032536506652832,
      "learning_rate": 5.2417302798982194e-05,
      "loss": 0.1312,
      "step": 585
    },
    {
      "epoch": 3.7341772151898733,
      "grad_norm": 1.0450551509857178,
      "learning_rate": 5.114503816793893e-05,
      "loss": 0.1444,
      "step": 590
    },
    {
      "epoch": 3.7658227848101267,
      "grad_norm": 0.8416706919670105,
      "learning_rate": 4.9872773536895677e-05,
      "loss": 0.1401,
      "step": 595
    },
    {
      "epoch": 3.7974683544303796,
      "grad_norm": 0.9472242593765259,
      "learning_rate": 4.860050890585242e-05,
      "loss": 0.1414,
      "step": 600
    },
    {
      "epoch": 3.829113924050633,
      "grad_norm": 1.0049540996551514,
      "learning_rate": 4.7328244274809166e-05,
      "loss": 0.142,
      "step": 605
    },
    {
      "epoch": 3.8607594936708862,
      "grad_norm": 0.9480180144309998,
      "learning_rate": 4.605597964376591e-05,
      "loss": 0.1416,
      "step": 610
    },
    {
      "epoch": 3.892405063291139,
      "grad_norm": 1.5082101821899414,
      "learning_rate": 4.478371501272265e-05,
      "loss": 0.1327,
      "step": 615
    },
    {
      "epoch": 3.9240506329113924,
      "grad_norm": 0.7728102207183838,
      "learning_rate": 4.351145038167939e-05,
      "loss": 0.1303,
      "step": 620
    },
    {
      "epoch": 3.9556962025316453,
      "grad_norm": 0.8425063490867615,
      "learning_rate": 4.223918575063613e-05,
      "loss": 0.1324,
      "step": 625
    },
    {
      "epoch": 3.9873417721518987,
      "grad_norm": 0.9874700307846069,
      "learning_rate": 4.096692111959288e-05,
      "loss": 0.1405,
      "step": 630
    },
    {
      "epoch": 4.018987341772152,
      "grad_norm": 0.47495508193969727,
      "learning_rate": 3.969465648854962e-05,
      "loss": 0.1071,
      "step": 635
    },
    {
      "epoch": 4.050632911392405,
      "grad_norm": 0.3872312009334564,
      "learning_rate": 3.842239185750636e-05,
      "loss": 0.0901,
      "step": 640
    },
    {
      "epoch": 4.082278481012658,
      "grad_norm": 0.5231612920761108,
      "learning_rate": 3.7150127226463104e-05,
      "loss": 0.0969,
      "step": 645
    },
    {
      "epoch": 4.113924050632911,
      "grad_norm": 1.3609975576400757,
      "learning_rate": 3.5877862595419845e-05,
      "loss": 0.0893,
      "step": 650
    },
    {
      "epoch": 4.1455696202531644,
      "grad_norm": 0.673308253288269,
      "learning_rate": 3.4605597964376594e-05,
      "loss": 0.0928,
      "step": 655
    },
    {
      "epoch": 4.177215189873418,
      "grad_norm": 0.482837438583374,
      "learning_rate": 3.3333333333333335e-05,
      "loss": 0.0912,
      "step": 660
    },
    {
      "epoch": 4.208860759493671,
      "grad_norm": 0.7408327460289001,
      "learning_rate": 3.2061068702290076e-05,
      "loss": 0.0912,
      "step": 665
    },
    {
      "epoch": 4.2405063291139244,
      "grad_norm": 0.5170139074325562,
      "learning_rate": 3.078880407124682e-05,
      "loss": 0.0958,
      "step": 670
    },
    {
      "epoch": 4.272151898734177,
      "grad_norm": 0.6944636106491089,
      "learning_rate": 2.9516539440203562e-05,
      "loss": 0.0883,
      "step": 675
    },
    {
      "epoch": 4.30379746835443,
      "grad_norm": 0.6294474601745605,
      "learning_rate": 2.824427480916031e-05,
      "loss": 0.0915,
      "step": 680
    },
    {
      "epoch": 4.3354430379746836,
      "grad_norm": 0.6630133390426636,
      "learning_rate": 2.6972010178117052e-05,
      "loss": 0.0905,
      "step": 685
    },
    {
      "epoch": 4.367088607594937,
      "grad_norm": 0.6174125075340271,
      "learning_rate": 2.5699745547073793e-05,
      "loss": 0.0958,
      "step": 690
    },
    {
      "epoch": 4.39873417721519,
      "grad_norm": 0.7214713096618652,
      "learning_rate": 2.4427480916030535e-05,
      "loss": 0.0905,
      "step": 695
    },
    {
      "epoch": 4.430379746835443,
      "grad_norm": 0.7950146794319153,
      "learning_rate": 2.3155216284987276e-05,
      "loss": 0.0925,
      "step": 700
    },
    {
      "epoch": 4.462025316455696,
      "grad_norm": 0.6609070301055908,
      "learning_rate": 2.1882951653944024e-05,
      "loss": 0.0929,
      "step": 705
    },
    {
      "epoch": 4.493670886075949,
      "grad_norm": 0.6648293733596802,
      "learning_rate": 2.0610687022900766e-05,
      "loss": 0.0908,
      "step": 710
    },
    {
      "epoch": 4.525316455696203,
      "grad_norm": 0.5198394656181335,
      "learning_rate": 1.9338422391857507e-05,
      "loss": 0.0907,
      "step": 715
    },
    {
      "epoch": 4.556962025316456,
      "grad_norm": 0.8868843913078308,
      "learning_rate": 1.8066157760814252e-05,
      "loss": 0.0957,
      "step": 720
    },
    {
      "epoch": 4.588607594936709,
      "grad_norm": 0.6488995552062988,
      "learning_rate": 1.6793893129770993e-05,
      "loss": 0.0898,
      "step": 725
    },
    {
      "epoch": 4.620253164556962,
      "grad_norm": 0.5500432252883911,
      "learning_rate": 1.5521628498727735e-05,
      "loss": 0.0946,
      "step": 730
    },
    {
      "epoch": 4.651898734177215,
      "grad_norm": 0.8371357321739197,
      "learning_rate": 1.424936386768448e-05,
      "loss": 0.0905,
      "step": 735
    },
    {
      "epoch": 4.6835443037974684,
      "grad_norm": 0.5861048102378845,
      "learning_rate": 1.2977099236641221e-05,
      "loss": 0.0941,
      "step": 740
    },
    {
      "epoch": 4.715189873417722,
      "grad_norm": 0.7422693371772766,
      "learning_rate": 1.1704834605597966e-05,
      "loss": 0.0956,
      "step": 745
    },
    {
      "epoch": 4.746835443037975,
      "grad_norm": 0.5376149415969849,
      "learning_rate": 1.0432569974554709e-05,
      "loss": 0.0937,
      "step": 750
    },
    {
      "epoch": 4.7784810126582276,
      "grad_norm": 0.46256035566329956,
      "learning_rate": 9.16030534351145e-06,
      "loss": 0.0878,
      "step": 755
    },
    {
      "epoch": 4.810126582278481,
      "grad_norm": 0.4410872459411621,
      "learning_rate": 7.888040712468193e-06,
      "loss": 0.0929,
      "step": 760
    },
    {
      "epoch": 4.841772151898734,
      "grad_norm": 0.44353851675987244,
      "learning_rate": 6.615776081424936e-06,
      "loss": 0.0909,
      "step": 765
    },
    {
      "epoch": 4.8734177215189876,
      "grad_norm": 0.5728248953819275,
      "learning_rate": 5.343511450381679e-06,
      "loss": 0.0946,
      "step": 770
    },
    {
      "epoch": 4.905063291139241,
      "grad_norm": 0.4497832655906677,
      "learning_rate": 4.0712468193384225e-06,
      "loss": 0.089,
      "step": 775
    },
    {
      "epoch": 4.936708860759493,
      "grad_norm": 0.650806725025177,
      "learning_rate": 2.7989821882951656e-06,
      "loss": 0.0939,
      "step": 780
    },
    {
      "epoch": 4.968354430379747,
      "grad_norm": 0.5624284148216248,
      "learning_rate": 1.5267175572519084e-06,
      "loss": 0.0918,
      "step": 785
    },
    {
      "epoch": 5.0,
      "grad_norm": 0.5814446210861206,
      "learning_rate": 2.544529262086514e-07,
      "loss": 0.0872,
      "step": 790
    }
  ],
  "logging_steps": 5,
  "max_steps": 790,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 9.503292302504755e+16,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}